package org.mcclone.crawler;

import com.google.common.collect.Lists;
import org.apache.commons.collections4.ListUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

/**
 * Command-line crawler that collects train numbers from huoche.net, downloads the
 * timetable of each train and writes all timetable rows to a tab-separated text file.
 *
 * <p>Each output line has the form
 * {@code trainNo \t index \t station \t time1 \t time2}, where {@code time1} and
 * {@code time2} are the texts of the two {@code <p>} elements found in the third
 * cell of a timetable row (presumably arrival and departure time; verify against the page).
 */
public class TrainCrawler {

    /** Output file used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "D:\\company\\火车时刻表-20181212-2.txt";

    /** Charset of the output file; fixed so the result does not depend on the platform default. */
    private static final String OUTPUT_CHARSET = "UTF-8";

    /** Travel date sent with every huoche.net timetable query. */
    private static final String QUERY_DATE = "2018-12-13";

    /** Prefix of the huoche.net pages that list train numbers; a numeric index is appended. */
    private static final String TRAIN_LIST_URL_PREFIX = "https://www.huoche.net/lieche/checi";

    /** First numeric index (inclusive) appended to {@link #TRAIN_LIST_URL_PREFIX}. */
    private static final int FIRST_TRAIN_LIST_INDEX = 9;

    /** Last numeric index (inclusive) appended to {@link #TRAIN_LIST_URL_PREFIX}. */
    private static final int LAST_TRAIN_LIST_INDEX = 17;

    /**
     * Crawls every known train number and writes the collected timetable lines to a file.
     *
     * <p>A train whose timetable cannot be fetched or parsed is skipped so that one bad
     * page does not abort the whole run; the failure is reported on standard error.
     *
     * @param args optional; {@code args[0]} overrides the output file path,
     *             otherwise {@link #DEFAULT_OUTPUT_PATH} is used
     * @throws IOException if the train list cannot be fetched or the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String outputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_PATH;

        List<String> result = new ArrayList<>();
        List<String> allTrain = crawlerAllTrain();
        int failed = 0;
        for (String trainNo : allTrain) {
            try {
                result.addAll(crawlerTrain2(trainNo));
            } catch (IOException | RuntimeException e) {
                // Best effort: keep going, but make the skipped train visible.
                failed++;
                System.err.println("Failed to crawl timetable for train " + trainNo + ": " + e);
            }
        }
        if (failed > 0) {
            System.err.println(failed + " of " + allTrain.size() + " trains could not be crawled");
        }
        FileUtils.writeLines(new File(outputPath), OUTPUT_CHARSET, result);
    }

    /**
     * Fetches the timetable of one train from huoche.net.
     *
     * <p>Every parsed line is also printed to standard output as it is produced.
     *
     * @param trainNo train number, inserted into the query string as-is
     * @return one tab-separated line per timetable row, in page order
     * @throws IOException if the page cannot be fetched or a row does not have the expected structure
     */
    private static List<String> crawlerTrain2(String trainNo) throws IOException {
        List<String> result = new ArrayList<>();
        Document document = Jsoup.connect("https://www.huoche.net/lieche/checi/?checi=" + trainNo + "&date=" + QUERY_DATE)
                .get();
        Elements rows = document.select(".timetable_cont table tr");
        // Row 0 is skipped (presumably the table header; confirm against the page).
        for (int i = 1; i < rows.size(); i++) {
            String line = trainNo + "\t" + parseTimetableRow(trainNo, i, rows.get(i));
            System.out.println(line);
            result.add(line);
        }
        return result;
    }

    /**
     * Converts one huoche.net timetable row into {@code index \t station \t time1 \t time2}.
     *
     * @param trainNo  train number, used only for the error message
     * @param rowIndex position of the row in the table, used only for the error message
     * @param row      the {@code <tr>} element to parse
     * @return the tab-separated row fields
     * @throws IOException if the row lacks the expected cells or nested elements
     */
    private static String parseTimetableRow(String trainNo, int rowIndex, Element row) throws IOException {
        Elements cells = row.select("td");
        Element index = cells.size() > 0 ? cells.get(0).selectFirst("em") : null;
        Element station = cells.size() > 1 ? cells.get(1).selectFirst("a") : null;
        Elements times = cells.size() > 2 ? cells.get(2).select("p") : new Elements();
        if (index == null || station == null || times.size() < 2) {
            throw new IOException("Unexpected timetable row " + rowIndex + " for train " + trainNo);
        }
        return index.text() + "\t" + station.text() + "\t"
                + times.get(0).text() + "\t" + times.get(1).text();
    }

    /**
     * Fetches the timetable of one train from Ctrip. Not called from within this class.
     *
     * @param trainNo train number, appended to the schedule URL as-is
     * @return one tab-separated line per row: the train number followed by the
     *         texts of cells 1 to 4 of that row
     * @throws IOException if the page cannot be fetched
     * @throws IndexOutOfBoundsException if the page has fewer than two matching
     *         {@code tbody} elements or a row has fewer than five cells
     */
    private static List<String> crawlerTrain(String trainNo) throws IOException {
        List<String> result = new ArrayList<>();
        Element body = Jsoup.connect("http://trains.ctrip.com/trainbooking/TrainSchedule/" + trainNo)
                .get()
                .select("#ctl00_MainContentPlaceHolder_pnlResult .tb_result tbody")
                .get(1);
        for (Element tr : body.children()) {
            List<String> row = new ArrayList<>();
            for (Element td : tr.children()) {
                row.add(td.text());
            }
            // Cell 0 is dropped; cells 1..4 are kept after the train number.
            List<String> data = ListUtils.union(Lists.newArrayList(trainNo), row.subList(1, 5));
            result.add(StringUtils.join(data, "\t"));
        }
        return result;
    }

    /**
     * Collects train numbers from the huoche.net list pages.
     *
     * <p>Requests {@link #TRAIN_LIST_URL_PREFIX} followed by each index from
     * {@link #FIRST_TRAIN_LIST_INDEX} to {@link #LAST_TRAIN_LIST_INDEX} and a trailing slash,
     * and takes the text of every link matched by {@code .grayline .z-z li a}.
     *
     * @return the link texts in page order, duplicates included
     * @throws IOException if any list page cannot be fetched
     */
    private static List<String> crawlerAllTrain() throws IOException {
        List<String> trains = new ArrayList<>();
        for (int i = FIRST_TRAIN_LIST_INDEX; i <= LAST_TRAIN_LIST_INDEX; i++) {
            Elements links = Jsoup.connect(TRAIN_LIST_URL_PREFIX + i + "/")
                    .get().select(".grayline .z-z li a");
            for (Element link : links) {
                trains.add(link.text());
            }
        }
        return trains;
    }
}
